X Inactivation

I'd like to explore the state of genes on the X chromosome and see to what degree the iPSCs reactivate their inactive Xs.


In [51]:
import cPickle
import datetime
import glob
import os
import random
import re
import subprocess

import cdpybio as cpb
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybedtools as pbt
import scipy
import scipy.stats as stats 
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels as sms

import cardipspy as cpy
import ciepy

%matplotlib inline
%load_ext rpy2.ipython

import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
    pbt.set_tempdir('/frazer01/home/cdeboever/tmp')
    
outdir = os.path.join(ciepy.root, 'output',
                      'x_inactivation')
cpy.makedir(outdir)

private_outdir = os.path.join(ciepy.root, 'private_output',
                              'x_inactivation')
cpy.makedir(private_outdir)


The rpy2.ipython extension is already loaded. To reload it, use:
  %reload_ext rpy2.ipython

In [2]:
sns.set_style('whitegrid')

In [3]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0)

gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)

genes = pbt.BedTool(cpy.gencode_gene_bed)

fn = os.path.join(ciepy.root, 'output', 'input_data', 'cnvs.tsv')
cnvs = pd.read_table(fn, index_col=0)

In [4]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 
                  'mbased_major_allele_freq.tsv')
maj_af = pd.read_table(fn, index_col=0)

fn = os.path.join(ciepy.root, 'output', 'input_data', 
                  'mbased_p_val_ase.tsv')
ase_pval = pd.read_table(fn, index_col=0)

locus_p = pd.Panel({'major_allele_freq':maj_af, 'p_val_ase':ase_pval})
locus_p = locus_p.swapaxes(0, 2)

snv_fns = glob.glob(os.path.join(ciepy.root, 'private_output', 'input_data', 'mbased_snv',
                                 '*_snv.tsv'))
count_fns = glob.glob(os.path.join(ciepy.root, 'private_output', 'input_data', 'allele_counts',
                                   '*mbased_input.tsv'))

snv_res = {}
for fn in snv_fns:
    snv_res[os.path.split(fn)[1].split('_')[0]] = pd.read_table(fn, index_col=0)
    
count_res = {}
for fn in count_fns:
    count_res[os.path.split(fn)[1].split('_')[0]] = pd.read_table(fn, index_col=0)

snv_p = pd.Panel(snv_res)

In [5]:
# We'll keep female subjects with no CNVs on the X chromosome.
sf = subject_meta[subject_meta.sex == 'F']
meta = sf.merge(rna_meta, left_index=True, right_on='subject_id')
s = set(meta.subject_id) & set(cnvs.ix[cnvs.chr == 'chrX', 'subject_id'])
meta = meta[meta.subject_id.apply(lambda x: x not in s)]

meta = meta.ix[[x for x in snv_p.items if x in meta.index]]

snv_p = snv_p.ix[meta.index]

In [6]:
a = meta.shape[0]
b = len(set(meta.subject_id))
print('Using {} samples from {} female donors.'.format(a, b))


Using 144 samples from 116 female donors.

In [7]:
snv_p = snv_p.ix[meta.index]
locus_p = locus_p.ix[meta.index]

In [8]:
# Filter  and take log.
tpm_f = tpm[meta[meta.sex == 'F'].index]
tpm_f = tpm_f[(tpm_f != 0).sum(axis=1) > 0]
log_tpm = np.log10(tpm_f + 1)
# Mean center.
log_tpm_c = (log_tpm.T - log_tpm.mean(axis=1)).T
# Variance normalize.
log_tpm_n = (log_tpm_c.T / log_tpm_c.std(axis=1)).T

Inactivation for Single Sample

Let's take a look at one sample.


In [9]:
df = locus_p.ix[meta.index[0], :, :].dropna()
x_single = df[gene_info.ix[df.index, 'chrom'] == 'chrX']
notx_single = df[gene_info.ix[df.index, 'chrom'] != 'chrX']

In [10]:
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
ax = axs[0]
x_single.major_allele_freq.hist(ax=ax, bins=np.arange(0.5, 1.05, 0.05))
ax.set_xlim(0.5, 1)
ax.set_title('X chromosome genes')
ax.set_ylabel('Number of genes')
ax.set_xlabel('Major allele frequency')
ax = axs[1]
notx_single.major_allele_freq.hist(ax=ax, bins=np.arange(0.5, 1.05, 0.05))
ax.set_xlim(0.5, 1)
ax.set_title('Autosomal genes')
ax.set_ylabel('Number of genes')
ax.set_xlabel('Major allele frequency')
plt.tight_layout();
#plt.savefig(os.path.join(outdir, 'single_sample_majaf.pdf'))


This figure shows the distribution of gene major allele frequencies for genes on the X chromosome and for genes on the autosomes. We can see that autosomal genes often have a MajAF near 50% with few genes near 100%. However, many X chromosome genes are near 100%. These genes are likely still inactivated.


In [11]:
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
ax = axs[0]
(-np.log10(x_single.p_val_ase + x_single.p_val_ase[x_single.p_val_ase != 0].min())).hist(ax=ax)
ax.set_title('X chromosome genes')
ax.set_ylabel('Number of genes')
ax.set_xlabel('$-\log_{10}$ $p$-value')
ax = axs[1]
(-np.log10(notx_single.p_val_ase + notx_single.p_val_ase[notx_single.p_val_ase != 0].min())).hist(ax=ax)
ax.set_title('Autosomal genes')
ax.set_ylabel('Number of genes')
ax.set_xlabel('$-\log_{10}$ $p$-value')
plt.tight_layout();


We can see that the X chromosome is enriched for genes with strong ASE likely due to incomplete X reactivation.

Inactivation for All Samples

Let's take a look at all female samples.


In [12]:
t = locus_p.ix[:, :, 'major_allele_freq']
x_all = locus_p.ix[:, set(t.index) & set(gene_info[gene_info.chrom == 'chrX'].index), :]
notx_all = locus_p.ix[:, set(t.index) & set(gene_info[gene_info.chrom != 'chrX'].index), :]

NANOG involved in the regulation of Xist (more NANOG expression -> less XIST expression though supposedly this doesn’t affect reactivation). OCT4 and SOX2 also supposed to affect XIST expresssion. cMYC, REX1, and KLF4 affect TSIX (this is all mouse stuff I think) (Minkovsky).


In [13]:
genes_to_plot = ['XIST', 'TSIX', 'NANOG', 'POU5F1', 'SOX2', 'MYC', 'ZFP42', 'KLF4']
t = pd.Series(gene_info.index, index=gene_info.gene_name)
    
exp = log_tpm_n.ix[t[genes_to_plot]].T
exp.columns = genes_to_plot
sns.corrplot(exp);


/frazer01/home/cdeboever/software/anaconda/envs/cie/lib/python2.7/site-packages/seaborn/linearmodels.py:1282: UserWarning: The `corrplot` function has been deprecated in favor of `heatmap` and will be removed in a forthcoming release. Please update your code.
  warnings.warn(("The `corrplot` function has been deprecated in favor "
/frazer01/home/cdeboever/software/anaconda/envs/cie/lib/python2.7/site-packages/seaborn/linearmodels.py:1348: UserWarning: The `symmatplot` function has been deprecated in favor of `heatmap` and will be removed in a forthcoming release. Please update your code.
  warnings.warn(("The `symmatplot` function has been deprecated in favor "

In [14]:
genes_to_plot = ['XIST', 'TSIX']
t = pd.Series(gene_info.index, index=gene_info.gene_name)
    
exp = log_tpm_n.ix[t[genes_to_plot]].T
exp.columns = genes_to_plot
# exp = log_tpm_n.ix[[gene_info[gene_info.gene_name == 'XIST'].index[0], 
#                     gene_info[gene_info.gene_name == 'TSIX'].index[0]]].T
# exp.columns = ['XIST', 'TSIX']
exp = exp.ix[x_all.items].sort_values(by='XIST', ascending=False)

fig = plt.figure(figsize=(12, 4))
gs = gridspec.GridSpec(1, 4, width_ratios=[0.5, 1.5, 3, 3])

ax = plt.subplot(gs[0])
sns.heatmap(np.array([meta.ix[exp.index, 'passage'].values]).T, 
            xticklabels=False, yticklabels=False, ax=ax)
ax.set_ylabel('')
ax.set_title('Passage')

ax = plt.subplot(gs[1])
sns.heatmap(exp, yticklabels=False, ax=ax)
ax.set_ylabel('')
ax.set_title('Expression')
#for t in ax.get_xticklabels():
#    t.set_rotation(30)

ax = plt.subplot(gs[2])
r = x_all.ix[:, :, 'major_allele_freq'].apply(lambda z: pd.cut(z[z.isnull() == False], 
                                                       bins=np.arange(0.5, 1.05, 0.05)))
r = r.apply(lambda z: z.value_counts())
sns.heatmap(r.ix[exp.index], yticklabels=False, ax=ax)
xmin,xmax = ax.get_xlim()
ax.set_xticks(np.arange(xmin, xmax + 1, 2))
ax.set_xticklabels(np.arange(0.5, 1.05, 0.1), rotation=30)
ax.set_ylabel('')
ax.set_title('X MajAF distribution')

ax = plt.subplot(gs[3])
r = notx_all.ix[:, :, 'major_allele_freq'].apply(lambda z: pd.cut(z[z.isnull() == False], 
                                                          bins=np.arange(0.5, 1.05, 0.05)))
r = r.apply(lambda z: z.value_counts())
sns.heatmap(r.ix[exp.index], yticklabels=False, ax=ax)
xmin,xmax = ax.get_xlim()
ax.set_xticks(np.arange(xmin, xmax + 1, 2))
ax.set_xticklabels(np.arange(0.5, 1.05, 0.1), rotation=30)
ax.set_ylabel('')
ax.set_title('Autosomal MajAF distribution')

gs.tight_layout(fig)
fig.savefig(os.path.join(outdir, 'x_reactivation_heatmap.png'), dpi=600)


These heatmaps are all aligned by row (so the first row across all heatmaps is the same sample, the second row across all heatmaps is the same sample, etc.). The heatmaps are ordered by each sample's XIST expression (shown in the second heatmap). The X MajAF distribution heatmap is the same as the histogram above for one sample but now we are stacking up the histograms for all samples. We can see that many samples have genes that are inactive although the amount of inactivation varies between samples and is highly correlated with XIST.

The X chromosome is clearly enriched for having ASE which is probably due to incomplete X reactivation.


In [15]:
t = x_all.ix[:, :, 'p_val_ase']
freq = (t < 0.005).sum() / (t.isnull() == False).sum()
print('{:.2f}% of genes per sample have significant ASE on chrX.'.format(freq.mean() * 100))
t = notx_all.ix[:, :, 'p_val_ase']
freq = (t < 0.005).sum() / (t.isnull() == False).sum()
print('{:.2f}% of genes per sample have significant ASE on autosomes.'.format(freq.mean() * 100))


44.58% of genes per sample have significant ASE on chrX.
2.87% of genes per sample have significant ASE on autosomes.

In [16]:
xist_gene_id = gene_info[gene_info.gene_name == 'XIST'].index[0]
tsix_gene_id = gene_info[gene_info.gene_name == 'TSIX'].index[0]

In [17]:
r = stats.spearmanr(meta.passage, log_tpm_n.ix[xist_gene_id, meta.index])
print('Passage and XIST expression are correlated (r={:.2f}) with p={:.3f}.'.format(
    r.correlation, r.pvalue))


Passage and XIST expression are correlated (r=-0.22) with p=0.007.

In [18]:
r = stats.spearmanr(meta.passage, log_tpm_n.ix[tsix_gene_id, meta.index])
print('Passage and TSIX expression are correlated (r={:.2f}) with p={:.3f}.'.format(
    r.correlation, r.pvalue))


Passage and TSIX expression are correlated (r=-0.20) with p=0.018.

In [19]:
percent_ase = ((x_all.ix[:, :, 'p_val_ase'] < 0.005).sum() / 
               (x_all.ix[:, :, 'p_val_ase'].isnull() == False).sum())

In [20]:
r = stats.spearmanr(percent_ase, meta.ix[percent_ase.index, 'passage'])
print('Percent ASE and passage are not correlated (r={:.2f}, p={:.3f}).'.format(
    r.correlation, r.pvalue))


Percent ASE and passage are not correlated (r=-0.06, p=0.449).

In [21]:
r = stats.spearmanr(percent_ase, log_tpm_n.ix[xist_gene_id, percent_ase.index])
print('Percent ASE and XIST expression are correlated (r={:.2f}) with p={:.3e}.'.format(
    r.correlation, r.pvalue))


Percent ASE and XIST expression are correlated (r=0.75) with p=2.299e-27.

In [22]:
r = stats.spearmanr(percent_ase, log_tpm_n.ix[tsix_gene_id, percent_ase.index])
print('Percent ASE and TSIX expression are correlated (r={:.2f}) with p={:.3e}.'.format(
    r.correlation, r.pvalue))


Percent ASE and TSIX expression are correlated (r=0.53) with p=5.179e-12.

X chromosome gene expression

Are reactivated genes expressed more highly?


In [23]:
fn = os.path.join(outdir, 'x_ase_exp.tsv')
if not os.path.exists(fn):
    t = locus_p.ix[:, :, 'p_val_ase']
    t = t.ix[set(t.index) & set(gene_info[gene_info.chrom == 'chrX'].index)]
    t = t[t.isnull().sum(axis=1) <= 0.8 * t.shape[1]]
    t = t.ix[set(log_tpm_n.index) & set(t.index)]

    rows = []
    for i in t.index:
        se = t.ix[i]
        se = se[se.isnull() == False]
        a = log_tpm_n.ix[i, se[se <= 0.005].index]
        b = log_tpm_n.ix[i, se[se > 0.005].index]

        rows.append([se.shape[0], a.shape[0], b.shape[0], a.mean(), b.mean()])

    x_exp = pd.DataFrame(rows, columns=['num_samples', 'num_sig', 'num_not_sig', 
                                        'mean_sig_exp', 'mean_not_sig_exp'],
                         index=t.index)
    x_exp = x_exp[(x_exp.num_sig >= 5) & (x_exp.num_not_sig >= 5)]
    x_exp.to_csv(fn, sep='\t')
else:
    x_exp = pd.read_table(fn, index_col=0)

In [24]:
(x_exp.mean_sig_exp - x_exp.mean_not_sig_exp).hist()
plt.ylabel('Number of genes')
plt.xlabel('Mean expression difference (sig - not sig)')
xmin, xmax = plt.xlim()
plt.xlim(-max(abs(xmin), abs(xmax)), max(abs(xmin), abs(xmax)))
ymin, ymax = plt.ylim()
plt.vlines(0, ymin, ymax);


The histogram above shows that genes have higher average expression in samples where the gene does not have significant ASE.


In [77]:
plt.scatter(x_exp.mean_sig_exp, x_exp.mean_not_sig_exp)
# xmin,xmax = plt.xlim()
# ymin,ymax = plt.ylim()
# plt.plot([-1, 2], [-1, 2])
# plt.xlim(-1, 1.75)
# plt.ylim(-1, 1.75)


Out[77]:
<matplotlib.collections.PathCollection at 0x7f25bfe4ad50>

In [65]:
len(set([x[0] for x in no_sig_ase.index]))


Out[65]:
155

In [64]:
len(set([x[0] for x in sig_ase.index]))


Out[64]:
156

In [60]:
t = locus_p.ix[:, :, 'p_val_ase']
t = t.ix[set(t.index) & set(gene_info[gene_info.chrom == 'chrX'].index)]
t = t[t.isnull().sum(axis=1) <= 0.8 * t.shape[1]]
t = t.ix[set(log_tpm_n.index) & set(t.index)]
exp = log_tpm_n.ix[t.index, t.columns]

no_sig_ase = exp[t > 0.005].stack()
sig_ase = exp[t < 0.005].stack()

pdfs = pd.DataFrame(index=np.arange(-5, 5 + 0.1, 0.1))
density = scipy.stats.gaussian_kde(no_sig_ase)
pdfs['no_ase'] = density(pdfs.index)
density = scipy.stats.gaussian_kde(sig_ase)
pdfs['ase'] = density(pdfs.index)
pdfs.to_csv(os.path.join(outdir, 'expression_densities.tsv'), sep='\t')
pdfs.plot()
plt.ylabel('Density')
plt.xlabel('$\log$ TPM $z$-score');


Reactivation percent

Only some genes show residual inactivation. Are these the same genes across samples?


In [32]:
t = locus_p.ix[:, :, 'p_val_ase']
t = t.ix[set(t.index) & set(gene_info[gene_info.chrom == 'chrX'].index)]
t = t[t.isnull().sum(axis=1) <= 0.8 * t.shape[1]]
freq = (t[t.isnull() == False] < 0.005).sum(axis=1) / (t.isnull() == False).sum(axis=1)

freq.hist()
plt.ylabel('Number of genes')
plt.xlabel('Percentage of samples with ASE ($p$ < 0.005)');



In [33]:
tt = locus_p.ix[:, t.index, 'major_allele_freq']

tt.mean(axis=1).hist()
plt.ylabel('Number of genes')
plt.xlabel('Average major allele frequency per gene');


The above histogram shows what percentage of samples were significant for ASE. Note that a gene in a sample is included here only if it was tested by MBASED for ASE. I restricted to genes that were tested by MBASED in at least 20% of the samples.

Reactivation across the X chromosome

I'd like to look at how genes are reactivated across the X chromosome. For instance, is reactivation correlated with replication timing or L1 density? I can consider things like

  • replication timing
  • distance from centromere or telomere
  • XIST binding motif density (does it have a binding motif?)
  • distance from X inactivation center
  • L1 density
x_all_f = x_all.ix[:, x_all.ix[:, :, 'p_val_ase'].isnull().sum(axis=1) <= 0.8 * x_all.shape[0], :] t = gene_info.ix[x_all_f.major_axis] r = ((t.end - t.start) / 2).astype(int) start = (t.start + r - (((t.end - t.start) / 2 % 1) == 0)).astype(int).astype(str) end = (t.end - r).astype(int).astype(str) s = '\n'.join(t.chrom + '\t' + start + '\t' + end + '\t' + pd.Series(t.index, index=t.index)) + '\n' xgenes_center_bt = pbt.BedTool(s, from_string=True) xgenes_center_bt = xgenes_center_bt.sort()

In [34]:
t = gene_info.ix[x_all.major_axis]
r = ((t.end - t.start) / 2).astype(int)
start = (t.start + r - (((t.end - t.start) / 2 % 1) == 0)).astype(int).astype(str)
end = (t.end - r).astype(int).astype(str)
s = '\n'.join(t.chrom + '\t' + start + '\t' + end + '\t' + 
              pd.Series(t.index, index=t.index)) + '\n'
xgenes_center_bt = pbt.BedTool(s, from_string=True)
xgenes_center_bt = xgenes_center_bt.sort()

Replication timing


In [35]:
# Replication timing data.
rt = pd.read_table('/publicdata/replication_domain_db_20151103/RD_sm300_2936763_hFibiPS4p72.hg19.txt',
                   low_memory=False, skiprows=15, index_col=0)
rt = rt[rt.Chromosome == 'chrX']
s = '\n'.join(rt.Chromosome + '\t' + rt.Start_Position.astype(str) + 
              '\t' + rt.End_Position.astype(str) + '\t' + rt.Data_Value.astype(str))
rt_bt = pbt.BedTool(s, from_string=True)
rt_bt = rt_bt.sort()

In [36]:
res = xgenes_center_bt.closest(rt_bt, d=True)
df = res.to_dataframe()
rt_by_gene = pd.Series(df.thickEnd.values, index=df.name)

L1 elements

I downloaded the repeat masker database from the table browser (group: Repeats, track: RepeatMasker, output format: all fields from selected table) to the file rmsk_db.txt.gz. I also downloaded the database as a bed file from the table browser to the file rmsk.bed. I put both of these in the output directory.


In [37]:
fn = os.path.join(outdir, 'line_one_elements.tsv')
if not os.path.exists(fn):
    rmsk = os.path.join(outdir, 'rmsk_db.txt.gz')
    repeat_db = pd.read_table(rmsk, low_memory=False)
    line_one_elements = repeat_db[repeat_db.repFamily == 'L1']
    line_one_elements = line_one_elements[line_one_elements.genoName == 'chrX']
    line_one_elements.to_csv(fn, sep='\t')
else:
    line_one_elements = pd.read_table(fn, index_col=0)
    
fn = os.path.join(outdir, 'line_one_elements.bed')
if not os.path.exists(fn):
    repeats = pd.read_table(os.path.join(outdir, 'rmsk.bed'), header=None, low_memory=False)
    repeats = repeats[repeats[0] == 'chrX']
    repeat_db = pd.read_table(rmsk, low_memory=False)
    se = pd.Series(dict(zip(repeat_db.repName, repeat_db.repFamily)))
    r = repeats[repeats[3].apply(lambda x: se[x] == 'L1')]
    s = '\n'.join(['\t'.join(x) for x in r.astype(str).values]) + '\n'
    line_one_bt = pbt.BedTool(s, from_string=True)
    line_one_bt = line_one_bt.sort()
    line_one_bt.saveas(fn)
else:
    line_one_bt = pbt.BedTool(fn)
    
res = xgenes_center_bt.window(line_one_bt, w=100000)
df = res.to_dataframe()
line_one_by_gene = df.name.value_counts()

In [38]:
line_one_bedgraph = os.path.join(outdir, 'line_one.bedGraph')
if not os.path.exists(line_one_bedgraph):
    res = line_one_bt.genome_coverage(g=pbt.genome_registry.hg19, bg=True)
    res.saveas(line_one_bedgraph)
    
line_one_bw = os.path.join(outdir, 'line_one.bw')
if not os.path.exists(line_one_bw):
    !bedGraphToBigWig {line_one_bedgraph} /software/bedtools-2.25.0/genomes/human.hg19.genome {line_one_bw}

In [39]:
line_one_bam = os.path.join(outdir, 'line_one_uniq_sorted.bam')
if not os.path.exists(line_one_bam):
    df = line_one_bt.to_dataframe()
    df.name = df.name + '_' + pd.Series(range(df.shape[0])).astype(str)
    fn = os.path.join(outdir, 'line_one_uniq.bed')
    df.to_csv(fn, header=None, index=None, sep='\t')
    out = os.path.join(outdir, 'line_one_uniq.bam')
    !bedToBam -i {fn} -g /frazer01/software/bedtools-2.25.0/genomes/human.hg19.genome > {out}
    !sambamba sort -o {line_one_bam} {out}
    !sambamba index {line_one_bam}
    !rm {out}

In [40]:
fn = os.path.join(outdir, 'alu_elements.tsv')
if not os.path.exists(fn):
    rmsk = os.path.join(outdir, 'rmsk_db.txt.gz')
    repeat_db = pd.read_table(rmsk, low_memory=False)
    alu_elements = repeat_db[repeat_db.repFamily == 'Alu']
    alu_elements.to_csv(fn, sep='\t')
else:
    alu_elements = pd.read_table(fn, index_col=0)
    
fn = os.path.join(outdir, 'alu_elements.bed')
if not os.path.exists(fn):
    repeats = pd.read_table(os.path.join(outdir, 'rmsk.bed'), header=None, low_memory=False)
    repeat_db = pd.read_table(rmsk, low_memory=False)
    se = pd.Series(dict(zip(repeat_db.repName, repeat_db.repFamily)))
    r = repeats[repeats[3].apply(lambda x: se[x] == 'Alu')]
    s = '\n'.join(['\t'.join(x) for x in r.astype(str).values]) + '\n'
    alu_bt = pbt.BedTool(s, from_string=True)
    alu_bt.saveas(fn)
else:
    alu_bt = pbt.BedTool(fn)
    
res = xgenes_center_bt.window(alu_bt, w=100000)
df = res.to_dataframe()
alu_by_gene = df.name.value_counts()

In [41]:
fn = os.path.join(outdir, 'line_two_elements.tsv')
if not os.path.exists(fn):
    rmsk = os.path.join(outdir, 'rmsk_db.txt.gz')
    repeat_db = pd.read_table(rmsk, low_memory=False)
    line_two_elements = repeat_db[repeat_db.repFamily == 'L2']
    line_two_elements.to_csv(fn, sep='\t')
else:
    line_two_elements = pd.read_table(fn, index_col=0)
    
fn = os.path.join(outdir, 'line_two_elements.bed')
if not os.path.exists(fn):
    repeats = pd.read_table(os.path.join(outdir, 'rmsk.bed'), header=None, low_memory=False)
    repeat_db = pd.read_table(rmsk, low_memory=False)
    se = pd.Series(dict(zip(repeat_db.repName, repeat_db.repFamily)))
    r = repeats[repeats[3].apply(lambda x: se[x] == 'L2')]
    s = '\n'.join(['\t'.join(x) for x in r.astype(str).values]) + '\n'
    line_two_bt = pbt.BedTool(s, from_string=True)
    line_two_bt.saveas(fn)
else:
    line_two_bt = pbt.BedTool(fn)
    
res = xgenes_center_bt.window(line_two_bt, w=100000)
df = res.to_dataframe()
line_two_by_gene = df.name.value_counts()

Combine features

sig_by_gene = ((x_all_f.ix[:, :, 'p_val_ase'] < 0.005).sum(axis=1) / (x_all_f.ix[:, :, 'p_val_ase'].isnull() == False).sum(axis=1)) t = x_all_f.ix[:, :, 'major_allele_freq'] t = t - t.mean() t = t / t.std() df = xgenes_center_bt.to_dataframe() pos = pd.Series(df.start.values, index=df.name) features = pd.DataFrame({'significant':sig_by_gene, 'rep_timing':rt_by_gene, 'line1':line_one_by_gene, 'pos':pos, 'mean_freq_z':t.mean(axis=1)})

In [42]:
df = xgenes_center_bt.to_dataframe()
pos = pd.Series(df.start.values, index=df.name)
features = pd.DataFrame({'rep_timing':rt_by_gene, 'line1':line_one_by_gene, 'pos':pos,
                         'alu':alu_by_gene, 'line2':line_two_by_gene})
df = xgenes_center_bt.to_dataframe() df['score'] = features.ix[df.name, 'mean_freq_z'].values s = '\n'.join(df.chrom + '\t' + df.start.astype(str) + '\t' + df.end.astype(str) + '\t' + df.score.astype(str)) + '\n' bt = pbt.BedTool(s, from_string=True) mean_freq_z_bedgraph = os.path.join(outdir, 'mean_freq_z.bedGraph') bt.saveas(mean_freq_z_bedgraph);
df = xgenes_center_bt.to_dataframe() df['score'] = features.ix[df.name, 'significant'].values s = '\n'.join(df.chrom + '\t' + df.start.astype(str) + '\t' + df.end.astype(str) + '\t' + df.score.astype(str)) + '\n' bt = pbt.BedTool(s, from_string=True) sig_bedgraph = os.path.join(outdir, 'sig.bedGraph') bt.saveas(sig_bedgraph);

I downloaded the banding track from the table browser (group: Mapping and Sequencing, track: Chromosome Band, table: cytoBand) to cytoBand_db.txt in the output directory.


In [43]:
cyto = pd.read_table(os.path.join(outdir, 'cytoBand_db.txt'))
cyto.columns = [x.replace('#', '') for x in cyto.columns]

In [44]:
cyto.ix[(cyto.chrom == 'chrX') & (cyto.gieStain == 'acen')]


Out[44]:
chrom chromStart chromEnd name gieStain
469 chrX 58100000 60600000 p11.1 acen
470 chrX 60600000 63000000 q11.1 acen
t = features[features.pos < 58100000] print(stats.spearmanr(t.mean_freq_z, t.pos)) print(stats.spearmanr(t.significant, t.pos)) t = features[features.pos > 63000000] print(stats.spearmanr(t.mean_freq_z, t.pos)) print(stats.spearmanr(t.significant, t.pos))
%%R suppressPackageStartupMessages(library(Gviz))
%%R -i rt_bedgraph,sig_bedgraph,mean_freq_z_bedgraph,line_one_bam ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chrX") # sig <- system.file(sig_bedgraph, package = "Gviz") sigTrack <- DataTrack(range=sig_bedgraph, genome="hg19", type=c("smooth", "p"), chromosome="chrX", name="Percent Significant ASE") # mfz <- system.file(sig_bedgraph, package = "Gviz") mfzTrack <- DataTrack(range=mean_freq_z_bedgraph, genome="hg19", type=c("smooth", "p"), chromosome="chrX", name="Mean Frequency z Score") #rt <- system.file(rt_bedgraph, package = "Gviz") rtTrack <- DataTrack(range=rt_bedgraph, genome="hg19", type="l", chromosome="chrX", name="Replication Timing") lineTrack <- DataTrack(range=line_one_bam, genome="hg19", type="l", window=-1, chromosome="chrX", name="L1 Elements")
%%R plotTracks(ideoTrack, from=0, to=58100000) plotTracks(rtTrack, from=0, to=58100000) plotTracks(lineTrack, from=0, to=58100000) plotTracks(mfzTrack, from=0, to=58100000) plotTracks(sigTrack, from=0, to=58100000)
%%R plotTracks(ideoTrack, from=63000000, to=155270560) plotTracks(rtTrack, from=63000000, to=155270560) plotTracks(lineTrack, from=0, to=58100000) plotTracks(mfzTrack, from=63000000, to=155270560) plotTracks(sigTrack, from=63000000, to=155270560)

In [45]:
%%R

suppressPackageStartupMessages(library(lme4))

In [46]:
lmm_features = pd.DataFrame(x_all.ix[:, :, 'major_allele_freq'].stack(), 
                            columns=['major_allele_freq'])
lmm_features.index.names = ['gene_id', 'sample_id']
lmm_features = lmm_features.reset_index()
t = pd.DataFrame(x_all.ix[:, :, 'p_val_ase'].stack(), 
                 columns=['p_val_ase'])
t.index.names = ['gene_id', 'sample_id']
t = t.reset_index()
lmm_features['p_val_ase'] = t['p_val_ase']
lmm_features = lmm_features.merge(features[['line1', 'alu', 'line2', 'pos', 'rep_timing']], 
                                  left_on='gene_id', right_index='True')

t = log_tpm_n.stack()
t = t.reset_index()
t.columns = ['gene_id', 'sample_id', 'exp']
t.index = t.gene_id + ':' + t.sample_id
lmm_features['exp'] = t.ix[lmm_features.gene_id + ':' + lmm_features.sample_id, 'exp'].values
lmm_features = lmm_features.dropna()

# random.seed('5454')
# rand = [random.random() for x in range(lmm_features.shape[0])]
# lmm_features['random'] = rand
random.seed('5454')
rand = [random.random() for x in set(lmm_features.gene_id)]
se = pd.Series(rand, index=set(lmm_features.gene_id))
lmm_features['random'] = se[lmm_features.gene_id].values

lmm_features_p = lmm_features[lmm_features.pos < 58100000]
lmm_features_q = lmm_features[lmm_features.pos > 63000000]

lmm_features.pos = lmm_features.pos - lmm_features.pos.min()
lmm_features.pos = lmm_features.pos  / lmm_features.pos.max()

lmm_features.line1 = lmm_features.line1 - lmm_features.line1.min()
lmm_features.line1 = lmm_features.line1  / lmm_features.line1.max()

lmm_features.alu = lmm_features.alu - lmm_features.alu.min()
lmm_features.alu = lmm_features.alu  / lmm_features.alu.max()

lmm_features.line2 = lmm_features.line2 - lmm_features.line2.min()
lmm_features.line2 = lmm_features.line2  / lmm_features.line2.max()

lmm_features_p = lmm_features.ix[lmm_features_p.index]
lmm_features_q = lmm_features.ix[lmm_features_q.index]

In [47]:
%%R -i lmm_features

model = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), data=lmm_features)
summary(model)


Linear mixed model fit by REML ['lmerMod']
Formula: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
   Data: lmm_features

REML criterion at convergence: -14002.9

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5899 -0.8089 -0.1123  0.8129  2.5679 

Random effects:
 Groups    Name        Variance Std.Dev.
 sample_id (Intercept) 0.001855 0.04307 
 Residual              0.018213 0.13496 
Number of obs: 12308, groups:  sample_id, 144

Fixed effects:
             Estimate Std. Error t value
(Intercept)  0.683307   0.004877  140.11
exp         -0.022137   0.001346  -16.45
line1        0.094432   0.007343   12.86
pos          0.062206   0.003898   15.96
rep_timing  -0.012542   0.001599   -7.84

Correlation of Fixed Effects:
           (Intr) exp    line1  pos   
exp        -0.022                     
line1      -0.377  0.003              
pos        -0.345 -0.021 -0.167       
rep_timing -0.306  0.006 -0.086  0.157

In [48]:
%%R -i lmm_features_p

model = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), data=lmm_features_p)
summary(model)


Linear mixed model fit by REML ['lmerMod']
Formula: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
   Data: lmm_features_p

REML criterion at convergence: -7327.2

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-2.5157 -0.7581 -0.1261  0.7281  3.0087 

Random effects:
 Groups    Name        Variance Std.Dev.
 sample_id (Intercept) 0.001846 0.04296 
 Residual              0.014837 0.12181 
Number of obs: 5549, groups:  sample_id, 144

Fixed effects:
             Estimate Std. Error t value
(Intercept)  0.656776   0.005418  121.21
exp         -0.023410   0.001840  -12.72
line1       -0.102725   0.012214   -8.41
pos          0.612135   0.017927   34.15
rep_timing  -0.009300   0.002236   -4.16

Correlation of Fixed Effects:
           (Intr) exp    line1  pos   
exp        -0.016                     
line1      -0.278 -0.001              
pos        -0.286 -0.042 -0.480       
rep_timing -0.362  0.021 -0.264  0.214

In [49]:
%%R -i lmm_features_q

model = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), data=lmm_features_q)
summary(model)


Linear mixed model fit by REML ['lmerMod']
Formula: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
   Data: lmm_features_q

REML criterion at convergence: -7487.2

Scaled residuals: 
     Min       1Q   Median       3Q      Max 
-2.56268 -0.81750 -0.05648  0.83158  2.51166 

Random effects:
 Groups    Name        Variance Std.Dev.
 sample_id (Intercept) 0.002052 0.04529 
 Residual              0.018494 0.13599 
Number of obs: 6757, groups:  sample_id, 144

Fixed effects:
             Estimate Std. Error t value
(Intercept)  0.715053   0.010201   70.09
exp         -0.022906   0.001810  -12.66
line1        0.106992   0.009705   11.02
pos         -0.000799   0.011018   -0.07
rep_timing  -0.002976   0.002201   -1.35

Correlation of Fixed Effects:
           (Intr) exp    line1  pos   
exp        -0.017                     
line1      -0.464  0.016              
pos        -0.845 -0.007  0.191       
rep_timing -0.073 -0.005 -0.038 -0.096

Whole chromosome

LINE 1 elements

In [50]:
%%R

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + pos + rep_timing + (1|sample_id), 
                  data=lmm_features, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features
Models:
model.null: major_allele_freq ~ exp + pos + rep_timing + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df    AIC    BIC logLik deviance Chisq Chi Df Pr(>Chisq)    
model.null  6 -13876 -13831 6943.8   -13888                            
model.full  7 -14038 -13986 7026.0   -14052 164.3      1  < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Position

In [51]:
%%R

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + line1 + rep_timing + (1|sample_id), 
                  data=lmm_features, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features
Models:
model.null: major_allele_freq ~ exp + line1 + rep_timing + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df    AIC    BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
model.null  6 -13788 -13743 6899.9   -13800                             
model.full  7 -14038 -13986 7026.0   -14052 252.19      1  < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Replicating timing

In [52]:
%%R

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + line1 + pos + (1|sample_id), 
                  data=lmm_features, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features
Models:
model.null: major_allele_freq ~ exp + line1 + pos + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df    AIC    BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
model.null  6 -13978 -13934 6995.3   -13990                             
model.full  7 -14038 -13986 7026.0   -14052 61.395      1  4.669e-15 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

p arm

LINE 1 elements

In [53]:
%%R -i lmm_features_p

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_p, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_p, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features_p
Models:
model.null: major_allele_freq ~ exp + pos + rep_timing + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df     AIC     BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
model.null  6 -7288.8 -7249.1 3650.4  -7300.8                             
model.full  7 -7357.1 -7310.7 3685.5  -7371.1 70.286      1  < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Position

In [54]:
%%R -i lmm_features_p

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_p, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + line1 + rep_timing + (1|sample_id), 
                  data=lmm_features_p, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features_p
Models:
model.null: major_allele_freq ~ exp + line1 + rep_timing + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df     AIC     BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
model.null  6 -6303.5 -6263.7 3157.7  -6315.5                             
model.full  7 -7357.1 -7310.7 3685.5  -7371.1 1055.6      1  < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Replication timing

In [55]:
%%R -i lmm_features_p

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_p, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + line1 + pos + (1|sample_id), 
                  data=lmm_features_p, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features_p
Models:
model.null: major_allele_freq ~ exp + line1 + pos + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df     AIC     BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
model.null  6 -7341.8 -7302.1 3676.9  -7353.8                             
model.full  7 -7357.1 -7310.7 3685.5  -7371.1 17.287      1  3.214e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

q arm

LINE 1 elements

In [56]:
%%R -i lmm_features_q

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_q, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_q, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features_q
Models:
model.null: major_allele_freq ~ exp + pos + rep_timing + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df     AIC     BIC logLik deviance  Chisq Chi Df Pr(>Chisq)    
model.null  6 -7399.7 -7358.7 3705.8  -7411.7                             
model.full  7 -7518.2 -7470.4 3766.1  -7532.2 120.51      1  < 2.2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Position

In [57]:
%%R -i lmm_features_q

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_q, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + line1 + rep_timing + (1|sample_id), 
                  data=lmm_features_q, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features_q
Models:
model.null: major_allele_freq ~ exp + line1 + rep_timing + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df     AIC     BIC logLik deviance  Chisq Chi Df Pr(>Chisq)
model.null  6 -7520.2 -7479.3 3766.1  -7532.2                         
model.full  7 -7518.2 -7470.4 3766.1  -7532.2 0.0051      1     0.9429
Replication timing

In [58]:
%%R -i lmm_features_q

model.full = lmer(major_allele_freq ~ exp + line1 + pos + rep_timing + (1|sample_id), 
                  data=lmm_features_q, REML=FALSE)
model.null = lmer(major_allele_freq ~ exp + line1 + pos + (1|sample_id), 
                  data=lmm_features_q, REML=FALSE)
anova(model.null, model.full)


Data: lmm_features_q
Models:
model.null: major_allele_freq ~ exp + line1 + pos + (1 | sample_id)
model.full: major_allele_freq ~ exp + line1 + pos + rep_timing + (1 | sample_id)
           Df     AIC     BIC logLik deviance  Chisq Chi Df Pr(>Chisq)
model.null  6 -7518.3 -7477.4 3765.2  -7530.3                         
model.full  7 -7518.2 -7470.4 3766.1  -7532.2 1.8285      1     0.1763

In [59]:
rt_data = rt[['Chromosome', 'Start_Position', 'End_Position', 'Data_Value']]
rt_data.columns = ['chromosome', 'start', 'end', 'Replication Timing']

In [60]:
%%R

suppressPackageStartupMessages(library(Gviz))

In [61]:
%%R -i rt_data

ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chrX")
rtTrack <- DataTrack(range=rt_data, genome="hg19", type=c("polygon"),
                     chromosome="chrX", name="Replication Timing")
%%R -i line_one_bam #bamFile <- system.file(line_one_bam, package = "Gviz") lineTrack <- DataTrack(range=line_one_bam, genome="hg19", type="l", window=-1, windowSize=1000000, chromosome="chrX", name="L1 Elements", )

In [62]:
%%R -i line_one_bw

#bamFile <- system.file(line_one_bam, package = "Gviz")
lineTrack <- DataTrack(range=line_one_bw, genome="hg19", type="l", window=-1,
                       chromosome="chrX", name="L1 Elements", )

In [64]:
lmm_features = lmm_features.merge(gene_info[['chrom', 'start', 'end']], 
                                  left_on='gene_id', right_index=True)
lmm_features.columns = [c.replace('chrom', 'chromosome') for c in lmm_features.columns]

In [111]:
t = x_all.ix[:, :, 'major_allele_freq']
r = gene_info.ix[t.index, ['start', 'end']]

In [66]:
%%R -i t,r

mafTrack <- DataTrack(range=r, data=t, genome="hg19", type=c("smooth", "p"), alpha=0.75, lwd=8,
                      span=0.05,
                      chromosome="chrX", name="Major Allele Frequency")

In [67]:
%%R

plotTracks(c(ideoTrack, rtTrack, mafTrack), from=63000000, to=155270560)



In [70]:
%%R

plotTracks(c(ideoTrack, rtTrack, lineTrack, mafTrack), from=63000000, to=155270560)



In [76]:
%%R

plotTracks(c(ideoTrack, rtTrack, lineTrack, mafTrack), from=0, to=58100000)



In [226]:
%%R

plotTracks(c(ideoTrack, rtTrack, lineTrack, mafTrack), from=0, to=58100000)



In [ ]:
%%R -i rt_bedgraph,sig_bedgraph,mean_freq_z_bedgraph,line_one_bam

ideoTrack <- IdeogramTrack(genome = "hg19", chromosome = "chrX")
# sig <- system.file(sig_bedgraph, package = "Gviz")
sigTrack <- DataTrack(range=sig_bedgraph, genome="hg19", type=c("smooth", "p"),
                      chromosome="chrX", name="Percent Significant ASE")
# mfz <- system.file(sig_bedgraph, package = "Gviz")
mfzTrack <- DataTrack(range=mean_freq_z_bedgraph, genome="hg19", type=c("smooth", "p"),
                      chromosome="chrX", name="Mean Frequency z Score")
#rt <- system.file(rt_bedgraph, package = "Gviz")
rtTrack <- DataTrack(range=rt_bedgraph, genome="hg19", type="l",
                     chromosome="chrX", name="Replication Timing")
lineTrack <- DataTrack(range=line_one_bam, genome="hg19", type="l", window=-1,
                       chromosome="chrX", name="L1 Elements")

In [ ]:


In [ ]:


In [ ]:

tm = meta.ix[t.columns] tm = tm.sort_values(by='in_eqtl', ascending=False) tm = tm.drop_duplicates(subset='subject_id') t = locus_p.ix[tm.index, :, 'p_val_ase'] t.index.name = 'gene_id' t = t.ix[set(t.index) & set(gene_info[gene_info.chrom == 'chrX'].index)] features = pd.DataFrame(t.stack(), columns=['pvalue']) t = locus_p.ix[tm.index, :, 'major_allele_freq'] t.index.name = 'gene_id' t = t.ix[set(t.index) & set(gene_info[gene_info.chrom == 'chrX'].index)] tt = pd.DataFrame(t.stack(), columns=['major_allele_freq']) features.ix[tt.index, 'major_allele_freq'] = tt.major_allele_freq features = features.reset_index() features['sig'] = features.pvalue < 0.005 features['pos'] = (gene_info.ix[features.gene_id, ['start', 'end']].sum(axis=1) / 2).values features['pos'] = features['pos'] - features['pos'].min() features['intercept'] = 1
rt = pd.read_table('/raid3/publicdata/replication_domain_db/RD_sm300_2936763_hFibiPS4p72.hg19.txt', low_memory=False, skiprows=15, index_col=0) rt = rt[rt.Chromosome == 'chrX'] s = '\n'.join(rt.Chromosome + '\t' + rt.Start_Position.astype(str) + '\t' + rt.End_Position.astype(str) + '\t' + rt.Data_Value.astype(str)) + '\n' rt_bt = pbt.BedTool(s, from_string=True) rt_bt = rt_bt.sort() res = genes.intersect(rt_bt, sorted=True, wo=True) res = res.to_dataframe() gb = res[['name', 'blockCount']].groupby('name').mean() gb.columns = ['replication_timing'] res = genes.closest(rt_bt, sorted=True) res = res.to_dataframe() res = res[res.chrom == 'chrX'] res['blockCount'] = res['blockCount'].astype(float) gb2 = res[['name', 'blockCount']].groupby('name').mean() gb2.columns = ['replication_timing'] gb2 = gb2.drop(gb.index) rt_x = pd.concat([gb, gb2]) features['replication_timing'] = rt_x.ix[features.gene_id, 'replication_timing'].values
d = pd.get_dummies(features['id']) f = features[['replication_timing', 'intercept']].join(d.ix[:, 1:]) logit = sm.Logit(endog=features.sig, exog=f) logit_result = logit.fit() logit_result.summary()
d = pd.get_dummies(features['id']) f = features[['replication_timing', 'pos', 'intercept']].join(d.ix[:, 1:]) logit = sm.Logit(endog=features.sig, exog=f) logit_result = logit.fit() logit_result.summary()
keys = sorted(set(features.pos)) se = pd.Series(range(len(keys)), index=keys) features['pos_norm'] = se[features.pos].values
features.head()
sns.jointplot(x='pos_norm', y='major_allele_freq', data=features, kind='hex');
sns.jointplot(x='pos_norm', y='replication_timing', data=features, kind='hex');
rt = pd.read_table('/raid3/publicdata/replication_domain_db/RD_sm300_2936763_hFibiPS4p72.hg19.txt', low_memory=False, skiprows=15, index_col=0) rt = rt[rt.Chromosome == 'chrX']
rt.head()
(rt.Start_Position.max() - rt.Start_Position.min()) / 10000
rt.Start_Position[1000] - rt.Start_Position[0]
rt.shape
rt.Data_Value.head(1000).plot(kind='bar')
t.head()

Reactivation QTLs


In [34]:
g = set(gene_info[gene_info.chrom == 'chrX'].index) & set(ase_pval.index)
s = meta[(meta.sex == 'F') & (meta.in_eqtl)].index
fx_maj_af = maj_af.ix[g, s]
fx_maj_af_f = fx_maj_af[(fx_maj_af.isnull() == False).sum(axis=1) >= 40]

In [35]:
fx_maj_af_f.shape


Out[35]:
(108, 116)

In [36]:
g = set(gene_info[gene_info.chrom == 'chrX'].index) & set(ase_pval.index)
s = meta[(meta.sex == 'F') & (meta.in_eqtl)].index
fx_maj_af = maj_af.ix[g, s]
fx_maj_af_f = fx_maj_af[(fx_maj_af.isnull() == False).sum(axis=1) >= 40]

In [37]:
cpy.makedir(os.path.join(outdir, 'inact_qtl'))
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'gene_to_regions.p')
gene_to_regions = cPickle.load(open(fn, 'rb'))

In [38]:
fx_maj_af_f.mean().hist()
plt.ylabel('Number of samples')
plt.xlabel('Mean major allele frequency');


I'm going to substract the mean from each sample (column) to account for differences in overall reactivation.


In [39]:
fx_maj_af_f = fx_maj_af_f - fx_maj_af_f.mean()

In [44]:
fx_maj_af_f.shape


Out[44]:
(108, 116)

In [40]:
def run_emmax_sge(gene_id, mem=4):
    """Run EMMAX for X inactivation eQTL."""
    se = fx_maj_af_f.ix[gene_id].dropna()
    se = cpb.general.transform_standard_normal(se)
    wgs = cpy.get_best_wgs_sample(meta.ix[se.index, 'subject_id'])
    se.index = wgs.wgs_id
    se = se[sorted(se.index)]
    
    toutdir = os.path.join(outdir, 'inact_qtl', gene_id)
    cpy.makedir(toutdir)
    
    samples = os.path.join(toutdir, 'emmax_samples.tsv')
    with open(samples, 'w') as f:
        f.write('\n'.join(se.index) + '\n')
    
    exp = os.path.join(toutdir, 'maj_af_std_norm.tsv')
    pd.DataFrame(se, columns=[gene_id]).T.to_csv(exp, sep='\t')
    
    vcf = '/projects/CARDIPS/pipeline/WGS/mergedVCF/CARDIPS_201512.femaleX.PASS.vcf.gz'
    regions = ','.join([x[3:] for x in gene_to_regions[gene_id]])
    kin = os.path.join(ciepy.root, 'output', 'eqtl_input', 'wgs.kin')

    res = datetime.datetime.now()
    date = re.sub(r'\D', '_', str(res))
    fn = os.path.join(toutdir, '{}_{}.sh'.format(gene_id, date))
    with open(fn, 'w') as f:
        f.write('#!/bin/bash\n\n')
        f.write('#$ -N emmax_{}_{}_x\n'.format(gene_id, date))
        num_threads = 4
        f.write('#$ -l short\n')
        f.write('#$ -l h_vmem={}G\n'.format(mem / num_threads))
        f.write('#$ -pe smp {}\n'.format(num_threads))
        f.write('#$ -S /bin/bash\n')
        f.write('#$ -o {}/emmax_{}_{}_x.out\n'.format(toutdir, gene_id, date))
        f.write('#$ -e {}/emmax_{}_{}_x.err\n\n'.format(toutdir, gene_id, date))
        f.write('module load cardips/1\n')
        f.write('source activate cie\n\n')
        
        cpy.makedir(toutdir)
        c = 'python {} \\\n\t'.format(os.path.join(ciepy.root, 'scripts', 'run_emmax.py'))
        c += ' \\\n\t'.join([
                gene_id,
                vcf,
                regions,
                exp,
                samples,
                kin,
                toutdir,
            ])
        f.write(c + '\n\n')
    subprocess.check_call('qsub {}'.format(fn), shell=True)

In [42]:
if not os.path.exists(os.path.join(outdir, 'inact_qtl')):
    for g in fx_maj_af_f.index:
        run_emmax_sge(g)

In [45]:
dys = glob.glob(os.path.join(outdir, 'inact_qtl', '*'))
gene_ids = []
pvalues = []
for dy in dys:
    gene_id = os.path.split(dy)[1]
    res_fn = os.path.join(os.path.join(dy, '{}.tsv'.format(gene_id)))
    res = ciepy.read_emmax_output(res_fn)
    min_fn = os.path.join(os.path.join(dy, 'minimum_pvalues.tsv'))
    min_pvals = pd.read_table(min_fn, header=None, squeeze=True)
    pvalues.append((1 + sum(min_pvals <= res.PVALUE.min())) / float(min_pvals.shape[0] + 1))
    gene_ids.append(gene_id)
pvalues = pd.Series(pvalues, index=gene_ids)

In [46]:
pvalues.hist()
plt.xlabel('$p$-value')
plt.ylabel('Number of genes');



In [47]:
(-np.log10(pvalues)).hist()
plt.xlabel('$-\log_{10} p$-value')
plt.ylabel('Number of genes');



In [48]:
r = sms.sandbox.stats.multicomp.multipletests(pvalues, method='fdr_bh')
pvalues_bh = pd.Series(r[1], index=pvalues.index)
sum(pvalues_bh < 0.05)


Out[48]:
0

In [49]:
pvalues = pvalues.sort_values()

The following genes are nominally significant but don't pass the FDR correction.


In [50]:
gene_info.ix[pvalues[pvalues < 0.05].index]


Out[50]:
gene_name gene_type chrom start end strand gene_status source level
ENSG00000238178.2 RP11-431J24.2 antisense chrX 16171063 16188992 - NOVEL HAVANA 2
ENSG00000102010.10 BMX protein_coding chrX 15482368 15574652 + KNOWN HAVANA 2
ENSG00000131725.9 WDR44 protein_coding chrX 117480035 117583924 + KNOWN HAVANA 2
ENSG00000204466.5 DGKK processed_transcript chrX 50108407 50213737 - KNOWN HAVANA 2
ENSG00000087842.6 PIR protein_coding chrX 15402920 15511687 - KNOWN HAVANA 2
ENSG00000068400.9 GRIPAP1 protein_coding chrX 48830133 48858675 - KNOWN HAVANA 2
ENSG00000130741.6 EIF2S3 protein_coding chrX 24072832 24096088 + KNOWN HAVANA 2